{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1 Import Pakages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import requests,time,os,csv,re,logging\n",
    "from bs4 import BeautifulSoup\n",
    "import numpy as np\n",
    "import traceback\n",
    "\n",
    "from multiprocessing import Pool"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2 功能模块"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 `requests` 获取html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_html(url):\n",
    "    try:\n",
    "        r=requests.get(url,timeout=16)\n",
    "        status=r.status_code\n",
    "        r.raise_for_status()\n",
    "        r.encoding=r.apparent_encoding\n",
    "        return r.text\n",
    "    except:\n",
    "        now_time=time.strftime('%Y%m%d%H%M%S')\n",
    "        logging.info(now_time +'      ' +'get_html 异常, ' '    url：'+ url)   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 尝试获取 `n` 次"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_n_times(page_url,n=5):\n",
    "    #获取页面失败或服务器错误是，重试n次\n",
    "    i=1\n",
    "    sleep_time=0.1\n",
    "    index_page=get_html(page_url)\n",
    "    \n",
    "    while index_page==None or \"小差\" in index_page:\n",
    "        if i > n:\n",
    "            break\n",
    "        else:\n",
    "            time.sleep(sleep_time)\n",
    "            index_page=get_html(page_url)\n",
    "            sleep_time+=0.1\n",
    "            i+=1                \n",
    "                               \n",
    "    return index_page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.3 获取某 `tag` 下总页数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_total_page(tag_url):\n",
    "    page_url=tag_url+'d1'\n",
    "    index_page=get_n_times(page_url)\n",
    "    \n",
    "    if index_page==None or \"小差\" in index_page:\n",
    "        total_page=101\n",
    "    else:\n",
    "        soup=BeautifulSoup(index_page,'html.parser')\n",
    "        taged_total_num=soup.find('div',attrs={'class':'search-result'}).span.string\n",
    "        total_page=int(taged_total_num)//30+1\n",
    "        if total_page>100:\n",
    "            total_page=100\n",
    "    return total_page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.4 页面解析\n",
    "- `BeautifulSoup` 页面内容提取\n",
    "- `csvwriter` 写入文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def parse_index_page(date,index_page,tag,page_num,total_page):\n",
    "    path=os.path.join(os.getcwd(),'results/%s_index_info_all.csv')  %date #L2：两室，P21:200万以下\n",
    "    fieldnames=['date','tag','taged_total_num','total_page','page_num','on_sale_num','saled_in_90d','last_day_viewed',\n",
    "                'key','title','total_price','total_price_unit','per_price','per_price_unit','prop1']\n",
    "    if  not os.path.exists(path):     #如果文件不存在，创建文件并写入表头\n",
    "        csv_file=open(path,'w',newline='')\n",
    "        writer=csv.DictWriter(csv_file,fieldnames=fieldnames)\n",
    "        writer.writeheader()\n",
    "        csv_file.close()\n",
    "        \n",
    "    csv_file=open(path,'a+',newline='')\n",
    "    writer=csv.DictWriter(csv_file,fieldnames=fieldnames)\n",
    "    \n",
    "    item={}\n",
    "    \n",
    "    soup=BeautifulSoup(index_page,'html.parser')\n",
    "    \n",
    "    #get_basic_info\n",
    "    \n",
    "    taged_total_num=soup.find('div',attrs={'class':'search-result'}).span.string\n",
    "    strong_num=soup.find_all('span',attrs={'class':'num strong-num'})\n",
    "    on_sale_num=strong_num[0].string                #在售总数\n",
    "    saled_in_90d=strong_num[1].string               #近90天成交\n",
    "    last_day_viewed=strong_num[2].string            #昨天带看次数\n",
    "    \n",
    "    \n",
    "    \n",
    "    #update_basic_info\n",
    "    item.update({'date':date,'tag':tag,'taged_total_num':taged_total_num,'total_page':total_page,'page_num':page_num,\n",
    "                 'on_sale_num':on_sale_num,'saled_in_90d':saled_in_90d,\n",
    "                'last_day_viewed':last_day_viewed})\n",
    "    \n",
    "    \n",
    "    index_info=soup.find_all('a',attrs={'class':'text link-hover-green js_triggerGray js_fanglist_title'})\n",
    "    for a in index_info:\n",
    "        key=a['key']\n",
    "        title=a['title']\n",
    "        item.update({'key':key,'title':title})\n",
    "        \n",
    "        price_div=a.parent.next_sibling.next_sibling\n",
    "        total_price=price_div.find('span',attrs={'class':'total-price strong-num'}).string\n",
    "        total_price_unit=price_div.find('span',attrs={'class':'unit'}).string\n",
    "        per_price_info=price_div.find('span',attrs={'class':'info-col price-item minor'}).string\n",
    "        per_price_info=re.search('(\\d+)(\\S{3})',per_price_info).groups()\n",
    "        per_price=per_price_info[0]\n",
    "        per_price_unit=per_price_info[1]\n",
    "        \n",
    "        prop_div=price_div.next_sibling.next_sibling\n",
    "        props=prop_div.find_all('span',attrs={'class':'c-prop-tag2'})\n",
    "        \n",
    "        if len(props)>0:\n",
    "            if \"距离\" in props[0].string:\n",
    "                prop1=props[0].string\n",
    "            else:\n",
    "                prop1=\"\"\n",
    "        else:\n",
    "            prop1=\"\"\n",
    "            \n",
    "#         prop2=props[1].string\n",
    "#         prop3=props[2].string\n",
    "        \n",
    "        item.update({'total_price':total_price,'total_price_unit':total_price_unit,'per_price':per_price,\n",
    "                    'per_price_unit':per_price_unit,'prop1':prop1})\n",
    "        \n",
    "        writer.writerow(item)    #定义writer2csv函数，每次调用函数写入item,结果是重复写入最后一个item，改用writer写入，无问题\n",
    "        \n",
    "    csv_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.5 按序爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_page_info(date,tag,page_num,total_page,error_list=[]):\n",
    "    now_time=time.strftime('%Y%m%d%H%M%S')\n",
    "    page_url=base_url+tag+'d'+str(page_num)\n",
    "    \n",
    "    index_page=get_n_times(page_url)\n",
    "    \n",
    "    if index_page!=None:\n",
    "        try:\n",
    "            parse_index_page(date,index_page,tag,page_num,total_page)\n",
    "        except:\n",
    "            error_list.append(tag+ 't' + str(total_page) + 'd' + str(page_num))\n",
    "            logging.info(now_time +'   ' +'Parse Error：'+tag+ '  ' + str(total_page) + '  '+'d' + str(page_num))\n",
    "            exit()\n",
    "\n",
    "    else:\n",
    "        error_list.append(tag+ 't' + str(total_page) + 'd' + str(page_num))\n",
    "        logging.info(now_time +'   ' +'get None：'+tag+ '  ' + str(total_page) + '  '+'d' + str(page_num))\n",
    "    \n",
    "    return error_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3 初始化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "base_url='http://sh.lianjia.com/ershoufang/'\n",
    "date=time.strftime('%Y%m%d')\n",
    "\n",
    "log_dir=os.path.join(os.getcwd(),'results/get_index.log')\n",
    "logging.basicConfig(filename=log_dir,level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# filter_tag=['p21l1','p21l6']      #['p21l1','p21l2','p21l3','p21l4','p21l6']\n",
    "# p_tag=['p22','p23','p24','p25','p26','p27']\n",
    "# l_tag=['l1','l2','l3','l4','l5','l6']\n",
    "# for p in p_tag:\n",
    "#     for l in l_tag:\n",
    "#         filter_tag.append(p+l)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<span class=\"burk\">TEST</span>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filter_tag=['p21l1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "error_list=[]\n",
    "error_tag=[]\n",
    "\n",
    "start_tag='p21l1'\n",
    "start_index=filter_tag.index(start_tag)\n",
    "start_page=1\n",
    "\n",
    "sleep_time=0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4 爬取框架"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1 主任务"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "tick=time.time()\n",
    "for tag_i in range(start_index,len(filter_tag)):\n",
    "    tag=filter_tag[tag_i]\n",
    "    tag_url=base_url+tag\n",
    "    total_page=get_total_page(tag_url)\n",
    "    \n",
    "    if total_page==101:\n",
    "        error_tag.append(tag)\n",
    "        print('Tag Error: ' + str(len(error_tag)))\n",
    "        continue\n",
    "    \n",
    "    if start_tag!=tag:\n",
    "        start_page=1\n",
    "    \n",
    "    for page_num in range(start_page,total_page+1):\n",
    "        get_page_info(date,tag,page_num,total_page,error_list)\n",
    "    \n",
    "    print(tag+' done, Page Error:'+str(len(error_list)))\n",
    "\n",
    "tock=time.time()\n",
    "time_cost=(tock-tick)/60\n",
    "time_cost=\"%.2f\"    %time_cost\n",
    "print('time cost: %.2f min'  %((tock-tick)/60))   #”%((tock-tick)/60)“需在print函数内部，否则出错"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Time record\n",
    "时间  |  错误数量\n",
    ":------|----:\n",
    "(min)|\n",
    "4.89|23\n",
    "4.83|6\n",
    "4.95|4\n",
    "3.04|6\n",
    "1.96|2\n",
    "2.25|7\n",
    "3.26|2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### kernal died error\n",
    "The kernel appears to have died. It will restart automatically.\n",
    "- `conda update mkl`\n",
    "- `pip install mkl_service-1.1.2-cp35-cp35m-win_amd64.whl`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.2 `error_tag` 再爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'error_tag' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-1-d1e98a5cad6c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mwhile\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merror_tag\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m>\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m     \u001b[0mold_error_tag\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0merror_tag\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m     \u001b[0merror_tag\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mtag\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mold_error_tag\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'error_tag' is not defined"
     ]
    }
   ],
   "source": [
    "while len(error_tag)>0:\n",
    "    old_error_tag=error_tag\n",
    "    error_tag=[]\n",
    "    \n",
    "    for tag in old_error_tag:\n",
    "        tag_url=base_url+tag\n",
    "        total_page=get_total_page(tag_url)\n",
    "        \n",
    "        if total_page==101:\n",
    "            error_tag.append(tag)\n",
    "            continue\n",
    "            \n",
    "        for page_num in range(1,total_page+1):\n",
    "            get_page_info(date,tag,page_num,total_page,error_list)\n",
    "            time.sleep(sleep_time)   #获取每页后休眠\n",
    "    print(\"number or error tag:\" + str(len(error_tag)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.3 `error_list` 再爬取\n",
    "- 爬取 `error_list`\n",
    "- 循环至 `error_list` 为空"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "while len(error_list)>0:\n",
    "    old_error_list=error_list\n",
    "    error_list=[]\n",
    "    for item in old_error_list:\n",
    "        tag=item[:5]   #返回：p21l2\n",
    "        total_page=re.search('t\\d*',item).group()[1:]     #返回：纯数字\n",
    "        page_num=re.search('d\\d*',item).group()[1:]       #返回：纯数字\n",
    "        get_page_info(date,tag,page_num,total_page,error_list)\n",
    "    print('number of error page: '+str(len(error_list)))        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {
    "height": "684px",
    "left": "0px",
    "right": "1382px",
    "top": "107px",
    "width": "218px"
   },
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
